package com.kdtech.analyse.Bbs;
import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.DoMainUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.NumberUtils;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.analyse.AnalyseNews;


/**
 * Analyser for tiexue.net forum (BBS) thread pages.
 *
 * <p>The site serves several different page layouts, so every field (title,
 * content, date, click count, comment count, author) is extracted by trying a
 * list of layout-specific CSS selectors in a fixed order and keeping the first
 * one that yields a usable value.
 */
public class TiexueBbsAnalyse implements AnalyseNews {

	/**
	 * Regular expressions for URLs accepted by {@link #isDetailPage(String)}.
	 * Literal dots and the query-string '?' are escaped so that they match
	 * only themselves. Examples of matching thread URLs:
	 * http://bbs.tiexue.net/post_5305514_1.html,
	 * http://bbs.tiexue.net/post2_6438230_1.html,
	 * http://bbs.tiexue.net/post2_9326055_1.html
	 */
	private static final String[] DETAIL_URL_PATTERNS = {
		"http://bbs\\.tiexue\\.net/post[0-9]*_[0-9]*_1\\.html",
		"http://bbs\\.tiexue\\.net/bbs[0-9]*-[0-9]*-1\\.html",
		"http://www\\.tiexue\\.net/ShowClass_[0-9]*_1\\.html",
		"http://bbs\\.tiexue\\.net/default\\.htm\\?ListUrl=http://bbs\\.tiexue\\.net/index\\.htm"
	};

	/** Title selectors, one per known page layout, in priority order. */
	private static final String[] TITLE_SELECTORS = {
		".bbsPosTit h1",
		"div.contentLeft div.wordwz h1",
		"div#post_Content div.theme div#themer ul.content li h1",
		"div#allDiv div.new div.new_left div.new_con h1",
		"div.LeftItem div.newcon ul li.newconli1 table tbody tr td h1",
		"div#topbutton_4679375 div.main div.main_right div.main_body div.main_right_content ul h1"
	};

	/** Content selectors, one per known page layout, in priority order. */
	private static final String[] CONTENT_SELECTORS = {
		"div#postContent p",
		"div#themer ul.content li div",
		"div.new div.new_left div.new_con div.text div.text2 div.newconli2",
		"div.main1 div.LeftItem div.newcon div.newconli2",
		"div.main_right_content ul li.main_tt"
	};

	/** Selectors whose text may contain the post date, in priority order. */
	private static final String[] DATE_TEXT_SELECTORS = {
		"div.date",
		"div#post_Content.con div.theme div.gray",
		"html body div#allDiv.con div.new div.new_left div.new_con div.user",
		"html body div.main div.main1 div.LeftItem div.newcon ul li.newconli3 div.newconli3l",
		"html body div#mainFrame.div_all div.bbs_topbutton div.main div.main_date div.main_date_left",
		"div.subMenu span.time"
	};

	/**
	 * Tells whether the URL matches one of the supported tiexue.net URL patterns.
	 *
	 * @param url the URL to test; {@code null} is treated as "no match"
	 * @return {@code true} if the whole URL matches one of the patterns
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (String pattern : DETAIL_URL_PATTERNS) {
			if (url.matches(pattern)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Parses a fetched thread page into a {@link NewsMeta}.
	 *
	 * <p>The page is parsed even when its URL does not satisfy
	 * {@link #isDetailPage(String)}; no such filtering is applied here.
	 *
	 * @param urlMeta the fetched page (URL plus raw HTML)
	 * @return the extracted record, or {@code null} when {@code urlMeta} or its
	 *         HTML is {@code null} (nothing to parse)
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		if (urlMeta == null || urlMeta.getHtml() == null) {
			return null;
		}
		String url = urlMeta.getUrl();
		Document doc = Jsoup.parse(urlMeta.getHtml());

		String title = firstNonBlankText(doc, TITLE_SELECTORS);
		String content = firstNonBlankContent(url, doc, CONTENT_SELECTORS);

		// Some layouts repeat the headline at the start of the body; drop it.
		// Content may be null when no selector produced anything.
		if (content != null && content.startsWith(title)) {
			content = content.substring(title.length());
		}

		// Last-resort title: the <title> tag, cut at the first '-'.
		// Applied after the prefix stripping above, so it never affects content.
		if (StringUtils.isBlank(title)) {
			title = doc.select("title").text();
			title = title.replace("\r\n", "");
			title = StringUtils.substringBefore(title, "-");
		}

		if (StringUtils.isBlank(content)) {
			// NOTE(review): "hd_xztk" is a bare tag selector; if a class or id was
			// intended it should be ".hd_xztk" / "#hd_xztk" - confirm against the markup.
			content = HtmlCleaner.getContentHtml(url, doc.select("hd_xztk"));
		}

		NewsMeta bbs = new NewsMeta();
		bbs.setUrl(url);
		bbs.setTitle(title);
		bbs.setContent(content);
		bbs.setDate(extractDate(doc));
		// URL to re-crawl when this record is updated later.
		bbs.setUpdateUrl(url);
		bbs.setClickNum(extractClickNum(doc));
		bbs.setCommentNum(extractCommentNum(doc));
		bbs.setAuthor(extractAuthor(doc));
		return bbs;
	}

	/**
	 * Re-fetches and re-parses the page behind the record's update URL.
	 *
	 * @param meta a previously parsed record
	 * @return the freshly parsed record, or {@code null} when {@code meta} is
	 *         {@code null}, has a blank update URL, or the page yields nothing
	 *         to parse
	 */
	public NewsMeta Update(NewsMeta meta) {
		if (meta == null) {
			return null;
		}
		String url = meta.getUpdateUrl();
		if (StringUtils.isBlank(url)) {
			return null;
		}
		return parserHtml(CrawlHTML.responseToURL(url));
	}

	/**
	 * Returns the text of the first selector that yields non-blank text; when
	 * none does, returns the (blank) text of the last selector tried.
	 */
	private static String firstNonBlankText(Document doc, String[] selectors) {
		String text = "";
		for (String selector : selectors) {
			text = doc.select(selector).text();
			if (!StringUtils.isBlank(text)) {
				return text;
			}
		}
		return text;
	}

	/**
	 * Returns the cleaned content of the first selector that yields non-blank
	 * content; when none does, returns the result for the last selector tried.
	 */
	private static String firstNonBlankContent(String url, Document doc, String[] selectors) {
		String content = null;
		for (String selector : selectors) {
			content = HtmlCleaner.getContentHtml(url, doc.select(selector));
			if (!StringUtils.isBlank(content)) {
				return content;
			}
		}
		return content;
	}

	/** Extracts the post date, trying each known location in turn; may return null. */
	private static Long extractDate(Document doc) {
		for (String selector : DATE_TEXT_SELECTORS) {
			Long fromText = DateUtils.matchDate(doc.select(selector).text());
			if (fromText != null) {
				return fromText;
			}
		}
		// This layout is searched via the raw markup of an inline <script>.
		Long fromScript = DateUtils.matchDate(doc.select("div.bar_1 script").outerHtml());
		if (fromScript != null) {
			return fromScript;
		}
		return DateUtils.matchDate(doc.select("div.bttada").text());
	}

	/** Extracts the read/click count; returns 0 when no layout yields a parsable number. */
	private static Integer extractClickNum(Document doc) {
		String text = stripReaderLabel(doc.select("div.postContent div.postTit p.float_R").text());
		if (StringUtils.isBlank(text)) {
			Elements holder = doc.select("html body div.box div#post_Content.con div#tit.tit div.titr p");
			if (holder.size() > 0) {
				text = stripReaderLabel(holder.get(0).ownText()).replace("[]", "");
			}
		}
		if (StringUtils.isBlank(text)) {
			text = doc.select("html body div#allDiv.con div.new div.new_left div.new_con div.user dl dd div.usdiv2").text();
		}
		if (StringUtils.isBlank(text)) {
			text = doc.select("html body div.main div.main1 div.LeftItem div.newcon ul li.newconli3 div.newconli3r div.pldiv font:eq(3)").text();
			text = text.replace("浏览", "").replace("IP", "");
		}
		if (StringUtils.isBlank(text)) {
			Elements holder = doc.select("html body div#mainFrame.div_all div.bbs_topbutton div.Topics div.topics_right");
			if (holder.size() > 0) {
				text = stripReaderLabel(holder.get(0).ownText());
			}
		}
		if (StringUtils.isBlank(text)) {
			text = doc.select("html body div.main div.contentLeft div.wordwz div.subMenu span.visits_num a.ico").text();
		}
		return parseClickCount(text);
	}

	/** Removes the "共 ... 个阅读者" label, spaces and literal "&nbsp;" around a reader count. */
	private static String stripReaderLabel(String raw) {
		return raw.replace("共", "").replace("个阅读者", "").replace(" ", "").replace("&nbsp;", "").trim();
	}

	/** Parses a click count, returning 0 when the text is not a plain integer. */
	private static Integer parseClickCount(String text) {
		if (text == null) {
			return 0;
		}
		// Also drop decoded non-breaking spaces (U+00A0): once the HTML is parsed,
		// an &nbsp; entity no longer appears as the literal "&nbsp;" stripped earlier.
		String digits = text.replace("\u00a0", "").trim();
		try {
			return Integer.parseInt(digits);
		} catch (NumberFormatException ignored) {
			// Best effort: a page without a recognisable counter keeps the default of 0.
			return 0;
		}
	}

	/**
	 * Extracts the comment/reply count.
	 *
	 * <p>NOTE(review): when the first pagination selector yields text, that text
	 * is never parsed; the count then comes only from the three-span pagination
	 * check at the end (otherwise it stays 0). Kept as-is - confirm this is intended.
	 */
	private static Integer extractCommentNum(Document doc) {
		Integer commentNum = 0;
		String text = doc.select("html body div.container div.mLeft div.main div.mRow_1 div.page p span").text();
		if (StringUtils.isBlank(text)) {
			text = doc.select("html body div.box div#post_Content.con div#difanye.difanye span div.difydiv strong").text();
			if (StringUtils.isBlank(text)) {
				text = doc.select("html body div#allDiv.con div.new div.new_left div.new_con div.user dl dd div.usdiv1").text();
			}
			if (StringUtils.isBlank(text)) {
				text = doc.select("html body div.main div.main1 div.LeftItem div.newcon ul li.newconli3 div.newconli3r div.pldiv font:eq(1)").text();
				text = text.replace("共", "").replace("条评论", "");
			}
			if (StringUtils.isBlank(text)) {
				text = doc.select("html body div#mainFrame.div_all div#topbutton_4679375.bbs_topbutton div.div_bottom_fenye strong").text();
			}
			// Keep only the last space-separated token before parsing.
			int lastSpace = text.lastIndexOf(" ");
			if (lastSpace != -1) {
				text = text.substring(lastSpace + 1).trim();
			}
			commentNum = NumberUtils.parseInt(text);
		}
		if (StringUtils.isBlank(text)) {
			text = doc.select("html body div.main div.contentLeft div.wordwz div.subMenu span.reply_num a.ico").text();
			commentNum = NumberUtils.parseInt(text);
		}
		// When the pagination bar has exactly three spans, the third one is used
		// as the count and overrides whatever was found above.
		Elements pageSpans = doc.select("html body div.container div.main div.mRow_1 div.page p span");
		if (pageSpans.size() == 3) {
			commentNum = NumberUtils.parseInt(pageSpans.get(2).text());
		}
		return commentNum;
	}

	/** Extracts the poster's name, trying each known layout in turn; may be blank. */
	private static String extractAuthor(Document doc) {
		String author = doc.select("html body div.main div.contentLeft div.wordwz div.subMenu span.posting_user a").text();
		if (StringUtils.isBlank(author)) {
			author = doc.select("span.fatieren a").text();
		}
		if (StringUtils.isBlank(author)) {
			Elements userName = doc.select("html body div.container div.main div.postContent div.postStart div.sideLeft ul.userMsg li.userName");
			if (userName != null && userName.size() != 0) {
				author = userName.get(0).select("p.down_icon").text();
			}
		}
		return author;
	}

	/**
	 * Manual smoke test: fetches one sample thread and prints the parsed record.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		TiexueBbsAnalyse analyser = new TiexueBbsAnalyse();

		String url = "http://bbs.tiexue.net/post2_9326055_1.html";
		System.out.println(DoMainUtils.GetDomainName(url));
		if (!analyser.isDetailPage(url)) {
			System.out.println("不匹配规则");
		} else {
			UrlMeta meta = CrawlHTML.responseToURL(url);
			NewsMeta parsed = analyser.parserHtml(meta);
			System.out.println(parsed);
		}
	}

}
